# Jupyter magic: render matplotlib figures inline in the notebook.
%matplotlib inline
import warnings
# Silence all warnings globally (notebook convenience; also hides deprecation notices).
warnings.filterwarnings("ignore")
import sqlite3
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer
import re
# Tutorial about Python regular expressions: https://pymotw.com/2/re/
# NOTE(review): `string`, `re`, and PorterStemmer are each imported more than
# once in this file; harmless, but worth deduplicating.
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle
from tqdm import tqdm
import os
from chart_studio.plotly import plotly
import plotly.offline as offline
import plotly.graph_objs as go
# Enable offline plotly rendering inside the notebook.
offline.init_notebook_mode()
from collections import Counter
# Load the DonorsChoose project proposals and their requested resources.
project_data = pd.read_csv('train_data.csv')
resource_data = pd.read_csv('resources.csv')
print("Number of data points in train data", project_data.shape)
print('-'*50)
print("The attributes of data :", project_data.columns.values)
# Fix: this line reports the resources table, not the train data.
print("Number of data points in resource data", resource_data.shape)
print(resource_data.columns.values)
resource_data.head(2)
# Let's print the first two rows of the project data
project_data.head(2)
# PROVIDE CITATIONS TO YOUR CODE IF YOU TAKE IT FROM ANOTHER WEBSITE.
# https://matplotlib.org/gallery/pie_and_polar_charts/pie_and_donut_labels.html#sphx-glr-gallery-pie-and-polar-charts-pie-and-donut-labels-py
# Class balance of the target: counts and percentages of approved projects.
y_value_counts = project_data['project_is_approved'].value_counts()
total = y_value_counts[1] + y_value_counts[0]
# Fix: "thar" -> "that" in both messages.
print("Number of projects that are approved for funding ", y_value_counts[1], ", (", (y_value_counts[1]/total)*100, "%)")
print("Number of projects that are not approved for funding ", y_value_counts[0], ", (", (y_value_counts[0]/total)*100, "%)")
# Donut chart of the two classes with labels pointed at each wedge.
fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(aspect="equal"))
recipe = ["Accepted", "Not Accepted"]
data = [y_value_counts[1], y_value_counts[0]]
wedges, texts = ax.pie(data, wedgeprops=dict(width=0.5), startangle=-40)
bbox_props = dict(boxstyle="square,pad=0.3", fc="w", ec="k", lw=0.72)
kw = dict(xycoords='data', textcoords='data', arrowprops=dict(arrowstyle="-"),
          bbox=bbox_props, zorder=0, va="center")
for i, p in enumerate(wedges):
    # place each label outside its wedge, connected by an angled arrow
    ang = (p.theta2 - p.theta1)/2. + p.theta1
    y = np.sin(np.deg2rad(ang))
    x = np.cos(np.deg2rad(ang))
    horizontalalignment = {-1: "right", 1: "left"}[int(np.sign(x))]
    connectionstyle = "angle,angleA=0,angleB={}".format(ang)
    kw["arrowprops"].update({"connectionstyle": connectionstyle})
    ax.annotate(recipe[i], xy=(x, y), xytext=(1.35*np.sign(x), 1.4*y),
                horizontalalignment=horizontalalignment, **kw)
# Fix: "Nmber" -> "Number" in the title.
ax.set_title("Number of projects that are Accepted and not accepted")
plt.show()
Observations: 1) From the above pie chart and calculations, the majority of projects have been approved for funding — 84.85%. 2) Projects that are not approved for funding make up 15.14% of the total.
# Check for NaN values in the project_data dataframe.
project_data.info()
# "teacher_prefix" has 3 missing values.  Fill them with the most frequent
# prefix (the mode) computed from the data, rather than a hard-coded 'Mrs.',
# so the fill stays correct if the data changes.  (Mode is 'Mrs.' here.)
prefix_mode = project_data['teacher_prefix'].mode()[0]
project_data['teacher_prefix'] = project_data['teacher_prefix'].fillna(prefix_mode)
# Confirm no missing values remain.
project_data['teacher_prefix'].isna().sum()
# Pandas dataframe groupby count, mean: https://stackoverflow.com/a/19385591/4084039
# Mean of a 0/1 target per group == fraction of approved projects per state.
# Use the built-in GroupBy.mean() instead of apply(np.mean): simpler, faster,
# and avoids the deprecated function-application path.
temp = project_data.groupby("school_state")["project_is_approved"].mean().reset_index()
temp.columns = ['state_code', 'num_proposals']
# NOTE(review): the block below is dead code, disabled by wrapping it in a
# triple-quoted string (a plotly US-state choropleth of acceptance rates).
# Consider deleting it or guarding it behind a boolean flag instead.
'''# How to plot US state heatmap: https://datascience.stackexchange.com/a/9620
scl = [[0.0, 'rgb(242,240,247)'],[0.2, 'rgb(218,218,235)'],[0.4, 'rgb(188,189,220)'],\
[0.6, 'rgb(158,154,200)'],[0.8, 'rgb(117,107,177)'],[1.0, 'rgb(84,39,143)']]
data = [ dict(
type='choropleth',
colorscale = scl,
autocolorscale = False,
locations = temp['state_code'],
z = temp['num_proposals'].astype(float),
locationmode = 'USA-states',
text = temp['state_code'],
marker = dict(line = dict (color = 'rgb(255,255,255)',width = 2)),
colorbar = dict(title = "% of pro")
) ]
layout = dict(
title = 'Project Proposals % of Acceptance Rate by US States',
geo = dict(
scope='usa',
projection=dict( type='albers usa' ),
showlakes = True,
lakecolor = 'rgb(255, 255, 255)',
),
)
fig = go.Figure(data=data, layout=layout)
offline.iplot(fig, filename='us-map-heat-map')
'''
# https://www.csi.cuny.edu/sites/default/files/pdf/administration/ops/2letterstabbrev.pdf
# Sort states by approval fraction (ascending) and show both extremes.
temp.sort_values(by=['num_proposals'], inplace=True)
print("States with lowest % approvals")
print(temp.head(5))
print('='*50)
print("States with highest % approvals")
print(temp.tail(5))
#stacked bar plots matplotlib: https://matplotlib.org/gallery/lines_bars_and_markers/bar_stacked.html
def stack_plot(data, xtick, col2='project_is_approved', col3='total'):
    """Draw overlaid bars of total (col3) vs approved (col2) counts per category.

    `data` is a dataframe with one row per category; `xtick` names the column
    used for the x-axis labels.
    """
    positions = np.arange(data.shape[0])
    plt.figure(figsize=(20, 5))
    total_bars = plt.bar(positions, data[col3].values)
    approved_bars = plt.bar(positions, data[col2].values)
    plt.ylabel('Projects')
    plt.title('Number of projects aproved vs rejected')
    plt.xticks(positions, list(data[xtick].values))
    plt.legend((total_bars[0], approved_bars[0]), ('total', 'accepted'))
    plt.show()
def univariate_barplots(data, col1, col2='project_is_approved', top=False):
    """Stack-plot and print, per value of `col1`: the number of approved
    projects, the total count, and the approval rate.

    Fixes two defects in the original:
      * it grouped the global `project_data` instead of the `data` argument;
      * it used dict-renaming inside .agg(), which raises SpecificationError
        on pandas >= 1.0.
    """
    grouped = data.groupby(col1)[col2]
    # number of rows with col2 == 1 per group; the resulting column keeps
    # col2's name, which stack_plot relies on below
    temp = grouped.agg(lambda x: x.eq(1).sum()).reset_index()
    # count/mean come from the same GroupBy, so row order matches temp
    temp['total'] = grouped.count().values
    temp['Avg'] = grouped.mean().values
    temp.sort_values(by=['total'], inplace=True, ascending=False)
    if top:
        temp = temp[0:top]
    stack_plot(temp, xtick=col1, col2=col2, col3='total')
    print(temp.head(5))
    print("="*50)
    print(temp.tail(5))
# Approval counts per US state (all states, no top-N cut).
univariate_barplots(project_data, 'school_state', 'project_is_approved', False)
Observation: 1) From the above graph we can see the average number of projects approved relative to the total projects submitted by each state. 2) California (CA) has submitted the highest number of projects and also has the highest number of approved projects. 3) Highest number of projects submitted = 15388, by CA. 4) Lowest number of projects submitted = 80, by VT.
# https://gist.github.com/sebleier/554280
# we are removing the words from the stop words list: 'no', 'nor', 'not'
# (negation words are kept because they carry sentiment useful for this task)
# NOTE(review): this list shadows the `stopwords` module imported from
# nltk.corpus at the top of the file — presumably intentional; confirm.
stopwords= ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",\
"you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', \
'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',\
'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', \
'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', \
'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',\
'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',\
'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',\
'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', \
's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', \
've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',\
"hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',\
"mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", \
'won', "won't", 'wouldn', "wouldn't"]
# https://stackoverflow.com/a/47091490/4084039
import re
def decontracted(phrase):
    """Expand common English contractions ("won't" -> "will not", ...)."""
    # special whole-word cases must run before the generic suffix rules
    rules = (
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        # generic contraction suffixes
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, replacement in rules:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase
# Combining all the above statemennts
from tqdm import tqdm
# NOTE(review): this cell runs the essay-cleaning pipeline over the
# `teacher_prefix` column and stores the result in `preprocessed_essays` —
# that looks like a leftover; the same variable is rebuilt from the actual
# essays later in the file.  Confirm and delete if unused.
preprocessed_essays = []
# tqdm is for printing the status bar
for sentance in tqdm(project_data['teacher_prefix'].values):
    # expand contractions, drop literal backslash escapes, keep only
    # alphanumerics, remove stopwords, lowercase
    sent = decontracted(sentance)
    sent = sent.replace('\\r', ' ')
    sent = sent.replace('\\"', ' ')
    sent = sent.replace('\\n', ' ')
    sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
    # https://gist.github.com/sebleier/554280
    sent = ' '.join(e for e in sent.split() if e not in stopwords)
    preprocessed_essays.append(sent.lower().strip())
def remove_punct(prefix):
    """Strip a single trailing period from a teacher prefix ("Mrs." -> "Mrs")."""
    return prefix[:-1] if prefix.endswith('.') else prefix
# Replace teacher_prefix with a period-free version and plot approvals per prefix.
project_data['clean_prefix'] = project_data['teacher_prefix'].apply(remove_punct)
project_data.drop('teacher_prefix',axis = 1, inplace = True)
univariate_barplots(project_data, 'clean_prefix', 'project_is_approved' , top=False)
Observations: 1) The highest numbers of projects were submitted by teachers with the prefixes Mrs. and Ms., followed by Mr., Teacher, and Dr., with 57269, 38955, 10648, 2360 and 13 projects respectively. 2) The fewest projects were submitted by teachers with the prefix Dr. — only 13. 3) The most approvals (48997) were for Mrs. (married female teachers); the fewest approvals were only 9.
def clean_project_grade(list_text_feature,df,old_col_name,new_col_name):
    """Normalise grade labels ("Grades PreK-2" -> "Grades_PreK_2"), write them
    to df[new_col_name], drop df[old_col_name], and return a {label: count}
    dict sorted by ascending frequency.
    """
    # remove special characters from list of strings python: https://stackoverflow.com/a/47301924/4084039
    # https://stackoverflow.com/questions/8270092/remove-all-whitespace-in-a-string-in-python
    cleaned_labels = []
    for raw_label in list_text_feature:
        words = raw_label.split(' ')
        # first word + the '-'-separated pieces of the last word:
        # ['Grades', 'PreK', '2'] -> 'Grades_PreK_2'
        pieces = [words[0]] + words[-1].split('-')
        cleaned_labels.append('_'.join(pieces).strip())
    df[new_col_name] = cleaned_labels
    df.drop([old_col_name], axis=1, inplace=True)
    from collections import Counter
    label_counts = Counter()
    for label in df[new_col_name].values:
        label_counts.update(label.split())
    return dict(sorted(dict(label_counts).items(), key=lambda kv: kv[1]))
# Clean project_grade_category in place and plot approvals per grade band.
grade_sorted_grade_dict = clean_project_grade(project_data['project_grade_category'],project_data,'project_grade_category','clean_grade_category')
univariate_barplots(project_data, 'clean_grade_category', 'project_is_approved', top=False)
Observations: 1) The most projects (44225) came from Grades PreK-2, of which 37536 were approved (84.87%). 2) The fewest projects (10963) came from Grades 9-12, of which 9183 were approved (83.76%).
catogories = list(project_data['project_subject_categories'].values)
# remove special characters from list of strings python: https://stackoverflow.com/a/47301924/4084039
# https://stackoverflow.com/questions/23669024/how-to-strip-a-specific-word-from-a-string
# https://stackoverflow.com/questions/8270092/remove-all-whitespace-in-a-string-in-python
def _clean_label_entries(raw_entries):
    """Clean a list of comma-separated label strings.

    For each entry, e.g. "Math & Science, Warmth, Care & Hunger":
      * drop 'The' when it appears as a standalone word,
      * remove spaces inside each label ("Math & Science" -> "Math&Science"),
      * join labels with single spaces and turn '&' into '_',
    giving "Math_Science Warmth Care_Hunger".
    """
    cleaned = []
    for entry in raw_entries:
        labels = []
        for label in entry.split(','):
            if 'The' in label.split():
                label = label.replace('The', '')
            labels.append(label.replace(' ', '').strip())
        cleaned.append(' '.join(labels).replace('&', '_').strip())
    return cleaned
cat_list = _clean_label_entries(catogories)
project_data['clean_categories'] = cat_list
project_data.drop(['project_subject_categories'], axis=1, inplace=True)
project_data.head(2)
univariate_barplots(project_data, 'clean_categories', 'project_is_approved', top=20)
Observations: 1) The most projects (23655) were submitted in the Literacy_Language category, with an acceptance rate of 86.74% — about 20520 projects accepted. 2) The fewest projects (1052) came from the AppliedLearning Math_Science category combination, of which 81.27% (around 855) were accepted.
# count of all the words in corpus python: https://stackoverflow.com/a/22898595/4084039
from collections import Counter
my_counter = Counter()
# category token -> number of projects that list it
for word in project_data['clean_categories'].values:
    my_counter.update(word.split())
# debug peek at the first 50 tokenised category strings
for word in project_data['clean_categories'].values[:50]:
    print(word.split())
# dict sort by value python: https://stackoverflow.com/a/613218/4084039
cat_dict = dict(my_counter)
sorted_cat_dict = dict(sorted(cat_dict.items(), key=lambda kv: kv[1]))
# bar chart of category frequencies, least frequent first
ind = np.arange(len(sorted_cat_dict))
plt.figure(figsize=(20,5))
p1 = plt.bar(ind, list(sorted_cat_dict.values()))
plt.ylabel('Projects')
plt.title('% of projects aproved category wise')
plt.xticks(ind, list(sorted_cat_dict.keys()))
plt.show()
Observations: 1) The graph above shows the number of projects per category. 2) The most projects are accepted from the Literacy_Language category, followed by Math_Science, and so on. 3) The fewest projects have been approved from the Warmth category, followed by Care_Hunger.
# Print each category with its project count, aligned in columns.
for category, count in sorted_cat_dict.items():
    print("{:20} :{:10}".format(category, count))
# Clean the sub-categories the same way as the main categories.
sub_catogories = list(project_data['project_subject_subcategories'].values)
# remove special characters from list of strings python: https://stackoverflow.com/a/47301924/4084039
# https://stackoverflow.com/questions/23669024/how-to-strip-a-specific-word-from-a-string
# https://stackoverflow.com/questions/8270092/remove-all-whitespace-in-a-string-in-python
sub_cat_list = []
for row in sub_catogories:
    # e.g. "Literacy & Language, Mathematics" -> "Literacy_Language Mathematics"
    cleaned_parts = []
    for part in row.split(','):
        # drop 'The' only when it appears as a standalone word
        if 'The' in part.split():
            part = part.replace('The', '')
        # squeeze out spaces so each label is a single token
        cleaned_parts.append(part.replace(' ', '').strip())
    joined = ' '.join(cleaned_parts).replace('&', '_')
    sub_cat_list.append(joined.strip())
project_data['clean_subcategories'] = sub_cat_list
project_data.drop(['project_subject_subcategories'], axis=1, inplace=True)
project_data.head(2)
univariate_barplots(project_data, 'clean_subcategories', 'project_is_approved', top=50)
Observations: 1) Among sub-categories, the most projects come from Literacy, with 9486 in total, of which 8371 were approved. 2) The fewest projects came from the AppliedScience College_CareerPre sub-category combination, with only 405 projects in total, of which 330 were approved.
# count of all the words in corpus python: https://stackoverflow.com/a/22898595/4084039
from collections import Counter
my_counter = Counter()
# sub-category token -> number of projects that list it
for word in project_data['clean_subcategories'].values:
    my_counter.update(word.split())
# dict sort by value python: https://stackoverflow.com/a/613218/4084039
sub_cat_dict = dict(my_counter)
sorted_sub_cat_dict = dict(sorted(sub_cat_dict.items(), key=lambda kv: kv[1]))
# bar chart of sub-category frequencies, least frequent first
ind = np.arange(len(sorted_sub_cat_dict))
plt.figure(figsize=(20,5))
p1 = plt.bar(ind, list(sorted_sub_cat_dict.values()))
plt.ylabel('Projects')
plt.title('% of projects approved sub_category wise')
plt.xticks(ind, list(sorted_sub_cat_dict.keys()))
plt.show()
Observations: 1) From the graph it is clear that the most approved projects come from the Literacy sub-category, while the fewest come from Economics and Community_Service.
# Print each sub-category with its project count, aligned in columns.
for i, j in sorted_sub_cat_dict.items():
    print("{:20} :{:10}".format(i,j))
#How to calculate number of words in a string in DataFrame: https://stackoverflow.com/a/37483537/4084039
# Distribution of word counts in project titles (fixes "Numeber" label typos).
word_count = project_data['project_title'].str.split().apply(len).value_counts()
word_dict = dict(word_count)
word_dict = dict(sorted(word_dict.items(), key=lambda kv: kv[1]))
ind = np.arange(len(word_dict))
plt.figure(figsize=(20,5))
p1 = plt.bar(ind, list(word_dict.values()))
plt.ylabel('Number of projects')
plt.xlabel('Number of words in project title')
plt.title('Words for each title of the project')
plt.xticks(ind, list(word_dict.keys()))
plt.show()
From the graph we can infer that most project titles contain only 4 words, while the fewest projects have lengthy titles of about 13 words.
# Title word counts split by approval outcome.
approved_title_word_count = project_data[project_data['project_is_approved']==1]['project_title'].str.split().apply(len)
approved_title_word_count = approved_title_word_count.values
rejected_title_word_count = project_data[project_data['project_is_approved']==0]['project_title'].str.split().apply(len)
rejected_title_word_count = rejected_title_word_count.values
# https://glowingpython.blogspot.com/2012/09/boxplot-with-matplotlib.html
plt.boxplot([approved_title_word_count, rejected_title_word_count])
plt.xticks([1,2],('Approved Projects','Rejected Projects'))
plt.ylabel('Words in project title')
plt.grid()
plt.show()
Observation: The above box plots compare the number of title words between approved and rejected projects. Approved projects tend to have slightly more words in their titles (about 4 to 7, versus 3 to 6 for rejected projects), but the medians look about the same. The number of title words alone does not clearly determine acceptance, though a more descriptive title may slightly improve a project's chances of approval.
# Smoothed (KDE) distributions of title word counts per class.
plt.figure(figsize=(10,3))
# NOTE(review): the `bw` keyword was replaced by `bw_method`/`bw_adjust` in
# seaborn >= 0.11 — confirm the pinned seaborn version before upgrading.
sns.kdeplot(approved_title_word_count,label="Approved Projects", bw=0.6)
sns.kdeplot(rejected_title_word_count,label="Not Approved Projects", bw=0.6)
plt.legend()
plt.show()
Observation: From the KDE plot above, there is not much difference between the approved and not-approved projects; the distributions are largely similar, with a slight edge for approved projects at longer titles. Hence the number of words in the project title has little effect on the acceptance rate.
# Merge the four essay columns into a single text field.
# Fixes two defects in the original concatenation:
#   * essays were glued together with no separator, fusing the last word of
#     one essay to the first word of the next (skewing all word counts);
#   * missing essays (NaN in project_essay_3/4) became the literal string
#     "nan" via map(str) — fill them with '' first instead.
project_data["essay"] = (
    project_data["project_essay_1"].fillna('').map(str) + ' ' +
    project_data["project_essay_2"].fillna('').map(str) + ' ' +
    project_data["project_essay_3"].fillna('').map(str) + ' ' +
    project_data["project_essay_4"].fillna('').map(str)
)
# Word counts of the combined essay, split by approval outcome.
approved_word_count = project_data[project_data['project_is_approved']==1]['essay'].str.split().apply(len)
approved_word_count = approved_word_count.values
rejected_word_count = project_data[project_data['project_is_approved']==0]['essay'].str.split().apply(len)
rejected_word_count = rejected_word_count.values
# https://glowingpython.blogspot.com/2012/09/boxplot-with-matplotlib.html
plt.boxplot([approved_word_count, rejected_word_count])
plt.title('Words for each essay of the project')
plt.xticks([1,2],('Approved Projects','Rejected Projects'))
plt.ylabel('Words in project essays')
plt.grid()
plt.show()
Observation: From the box plot above, we can infer that the effect of essay word count on the acceptance rate is minimal: the distributions for approved and not-approved projects overlap almost completely, with mean word counts of roughly 240 and 225 respectively.
plt.figure(figsize=(10,3))
# Smoothed (KDE-only) distribution of essay word counts per class.
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11 (use
# kdeplot/displot) — confirm the pinned seaborn version.
sns.distplot(approved_word_count, hist=False, label="Approved Projects")
sns.distplot(rejected_word_count, hist=False, label="Not Approved Projects")
plt.title('Words for each essay of the project')
# Fix: "eassay" -> "essay" in the axis label.
plt.xlabel('Number of words in each essay')
plt.legend()
plt.show()
The KDE plot above shows similar distributions of essay word counts for approved and not-approved projects, so word count alone cannot tell us much about a project's acceptance chances.
# we get the cost of the project using resource.csv file
resource_data.head(2)
# https://stackoverflow.com/questions/22407798/how-to-reset-a-dataframes-indexes-for-all-groups-in-one-step
# Total price and quantity requested per project id.
price_data = resource_data.groupby('id').agg({'price':'sum', 'quantity':'sum'}).reset_index()
price_data.head(2)
# join two dataframes in python:
# left join keeps every project row even if it has no matching resources
project_data = pd.merge(project_data, price_data, on='id', how='left')
# Project cost split by approval outcome.
approved_price = project_data[project_data['project_is_approved']==1]['price'].values
rejected_price = project_data[project_data['project_is_approved']==0]['price'].values
# https://glowingpython.blogspot.com/2012/09/boxplot-with-matplotlib.html
plt.boxplot([approved_price, rejected_price])
plt.title('Box Plots of Cost per approved and not approved Projects')
plt.xticks([1,2],('Approved Projects','Rejected Projects'))
plt.ylabel('Price')
plt.grid()
plt.show()
Observation: From the above box plot, the cost per project is nearly equal for approved and not-approved projects (slightly higher for not-approved), and the mean cost per project also looks the same. Approved projects have more outliers than not-approved projects.
plt.figure(figsize=(10,3))
# KDE-only cost distributions per class.
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11.
sns.distplot(approved_price, hist=False, label="Approved Projects")
sns.distplot(rejected_price, hist=False, label="Not Approved Projects")
plt.title('Cost per approved and not approved Projects')
plt.xlabel('Cost of a project')
plt.legend()
plt.show()
The KDE plot makes it clearer: the cost distributions for the two classes are nearly identical, so the acceptance rate is almost independent of the cost per project.
# http://zetcode.com/python/prettytable/
from prettytable import PrettyTable
# If you get a ModuleNotFoundError, install prettytable using: pip3 install prettytable
# Table of price percentiles (every 5th) for each class.
x = PrettyTable()
x.field_names = ["Percentile", "Approved Projects", "Not Approved Projects"]
for pct in range(0, 101, 5):
    approved_val = np.round(np.percentile(approved_price, pct), 3)
    rejected_val = np.round(np.percentile(rejected_price, pct), 3)
    x.add_row([pct, approved_val, rejected_val])
print(x)
# Approval rate per number of previously posted projects.
# GroupBy.mean() replaces the deprecated apply(np.mean) path.
# NOTE(review): `temp` is not used afterwards in this cell — the plot below
# recomputes its own aggregation; confirm and remove if truly unused.
temp = pd.DataFrame(project_data.groupby('teacher_number_of_previously_posted_projects')["project_is_approved"].mean()).reset_index()
univariate_barplots(project_data, 'teacher_number_of_previously_posted_projects', 'project_is_approved', False)
Observation: 1) The figure shows that most emphasis falls on new teachers submitting a project for the first time — most teachers in the data have no previous submissions. 2) A total of 30014 teachers submitted a project for the first time, of which 24652 (nearly 82%) were accepted, followed by teachers who had submitted once before, and so on. 3) Teachers with few previous submissions account for most accepted projects. 4) As the number of previous submissions grows, the count of such teachers — and hence of acceptances — drops off sharply.
# Distribution of teacher_number_of_previously_posted_projects values
# (fixes "Numeber"/"previousl" typos in the labels).
teacher_count = project_data['teacher_number_of_previously_posted_projects'].value_counts()
teacher_dict = dict(teacher_count)
teacher_dict = dict(sorted(teacher_dict.items(), key=lambda kv: kv[1]))
ind = np.arange(len(teacher_dict))
plt.figure(figsize=(20,5))
p1 = plt.bar(ind, list(teacher_dict.values()))
plt.ylabel('Number of projects')
plt.xlabel('teacher_number_of_previously_posted_projects')
plt.title('Number of teachers for previously posted projects')
plt.xticks(ind, list(teacher_dict.keys()))
plt.show()
The graph above shows that as the number of previously posted projects per teacher increases, the number of such projects declines sharply.
resource_summaries = list(project_data['project_resource_summary'].values)
# remove special characters from list of strings python: https://stackoverflow.com/a/47301924/4084039
# https://stackoverflow.com/questions/23669024/how-to-strip-a-specific-word-from-a-string
# Unlike the category cleaning, spaces and '&' are intentionally kept here;
# only the standalone word 'The' is dropped from each comma-separated segment.
res_list = []
for summary in resource_summaries:
    segments = []
    for segment in summary.split(','):
        if 'The' in segment.split():
            segment = segment.replace('The', '')
        segments.append(segment.strip())
    res_list.append(' '.join(segments).strip())
project_data['clean_project_resource_summary'] = res_list
project_data.drop(['project_resource_summary'], axis=1, inplace=True)
project_data.head(2)
univariate_barplots(project_data, 'clean_project_resource_summary' , 'project_is_approved', top=50)
Observation: The resource summaries suggest that more economical projects are accepted at a higher rate. The most common approved summary (40 accepted out of 48) requests only simple electronic tablets, compared with other summaries requesting costlier items such as Chromebooks or an Amazon Echo Dot.
# Resource-summary word counts split by approval outcome.
approved_word_count = project_data[project_data['project_is_approved']==1]['clean_project_resource_summary'].str.split().apply(len)
approved_word_count = approved_word_count.values
rejected_word_count = project_data[project_data['project_is_approved']==0]['clean_project_resource_summary'].str.split().apply(len)
rejected_word_count = rejected_word_count.values
plt.figure(figsize=(10,3))
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11.
sns.distplot(approved_word_count, hist=False, label="Approved Projects")
sns.distplot(rejected_word_count, hist=False, label="Not Approved Projects")
plt.title('Words for each resource summary of the project')
plt.xlabel('Number of words in each resource summary')
plt.legend()
plt.show()
As is evident from the graph above, the number of words in the resource summary has little effect on the project's acceptance rate.
# https://glowingpython.blogspot.com/2012/09/boxplot-with-matplotlib.html
# Box plot of resource-summary word counts per class
# (fixes the "resourec" typo in the title).
plt.boxplot([approved_word_count, rejected_word_count])
plt.title('Words for each resource summary of the project')
plt.xticks([1,2],('Approved Projects','Rejected Projects'))
plt.ylabel('Words in project resource summary')
plt.grid()
plt.show()
Again, the box plot shows that the number of words in the resource summary does not affect the acceptance rate: both classes have nearly the same distribution of summary lengths.
#How to calculate number of words in a string in DataFrame: https://stackoverflow.com/a/37483537/4084039
# Distribution of word counts in cleaned resource summaries
# (fixes "Numeber" label typos).
word_count = project_data['clean_project_resource_summary'].str.split().apply(len).value_counts()
word_dict = dict(word_count)
word_dict = dict(sorted(word_dict.items(), key=lambda kv: kv[1]))
ind = np.arange(len(word_dict))
plt.figure(figsize=(20,5))
p1 = plt.bar(ind, list(word_dict.values()))
plt.ylabel('Number of projects')
plt.xlabel('Number of words in project resource summary')
plt.title('Words for each resource summary of the project')
plt.xticks(ind, list(word_dict.keys()))
plt.show()
# Check the effect of numeric digits in the resource summary on acceptance.
# Fixes in this cell:
#   * the original iterated `range(109248)` — a hard-coded row count that
#     breaks on any other dataset or after filtering;
#   * it performed four .loc lookups per row; iterate the column values once.
with_digit_approved = []
without_digit_approved = []
with_digit_not_approved = []
without_digit_not_approved = []
summaries = project_data['clean_project_resource_summary'].values
approvals = project_data['project_is_approved'].values
for summary, approved in zip(summaries, approvals):
    # a summary "has digits" when any whitespace-separated token is all digits
    has_digit = any(word.isdigit() for word in summary.split())
    if has_digit:
        bucket = with_digit_approved if approved == 1 else with_digit_not_approved
    else:
        bucket = without_digit_approved if approved == 1 else without_digit_not_approved
    bucket.append(summary)
total_projects = len(with_digit_approved) + len(without_digit_approved) + len(with_digit_not_approved) + len(without_digit_not_approved)
print("Total percentage of projects with resource having digits approved:",len(with_digit_approved) / total_projects * 100,"%")
print("Total percentage of projects with resource having digits not approved:",len(with_digit_not_approved) / total_projects * 100,"%")
print("Total percentage of projects with resource not having digits approved:",len(without_digit_approved) / total_projects * 100,"%")
print("Total percentage of projects with resource not having digits not approved:",len(without_digit_not_approved) / total_projects * 100,"%")
print("Total projects summaries with digits:", (len(with_digit_approved) + len(with_digit_not_approved)) / total_projects * 100,"%")
print("Total projects summaries without digits:", (len(without_digit_approved) + len(without_digit_not_approved)) / total_projects * 100,"%")
Observation: About 10.4% of project summaries contain digits; of those, roughly 90% were approved and only 10% rejected. We can infer that including concrete numbers in the summary helps reviewers assess the project's impact and is associated with a higher acceptance rate.
project_data.head(2)
# printing some random essays.
# Spot-check raw essay text at a few fixed indices before cleaning.
print(project_data['essay'].values[0])
print("="*50)
print(project_data['essay'].values[150])
print("="*50)
print(project_data['essay'].values[1000])
print("="*50)
print(project_data['essay'].values[20000])
print("="*50)
print(project_data['essay'].values[99999])
print("="*50)
# https://stackoverflow.com/a/47091490/4084039
import re
def decontracted(phrase):
    """Expand common English contractions ("won't" -> "will not", ...).

    NOTE(review): this re-defines the identical helper from earlier in the
    file; consider removing one copy.
    """
    text = phrase
    # exact words first, so the generic "n't" rule does not mangle them
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can\'t", "can not", text)
    # generic contraction suffixes
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    return text
# Demonstrate each cleaning step on one sample essay.
sent = decontracted(project_data['essay'].values[20000])
print(sent)
print("="*50)
# \r \n \t remove from string python: http://texthandler.com/info/remove-line-breaks-python/
# replace literal backslash escape sequences left in the exported text
sent = sent.replace('\\r', ' ')
sent = sent.replace('\\"', ' ')
sent = sent.replace('\\n', ' ')
print(sent)
#remove spacial character: https://stackoverflow.com/a/5843547/4084039
sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
print(sent)
# Combine all of the above cleaning steps into one pass over every essay.
from tqdm import tqdm
# NOTE(review): `stopwords` is assumed to be an iterable of lowercase stop
# words defined earlier in the file; materialize it once as a set so the
# per-word membership test inside the loop is O(1) — TODO confirm.
stop_words = set(stopwords)
preprocessed_essays = []
# tqdm is for printing the status bar
for sentance in tqdm(project_data['essay'].values):
    sent = decontracted(sentance)
    # Raw text carries literal backslash escapes; replace them with spaces.
    sent = sent.replace('\\r', ' ').replace('\\"', ' ').replace('\\n', ' ')
    # Keep alphanumerics only: https://stackoverflow.com/a/5843547/4084039
    sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
    # Fix: lowercase BEFORE filtering stop words. The original filtered first
    # and lowered afterwards, so capitalized stop words ("The", "My") survived.
    # https://gist.github.com/sebleier/554280
    sent = ' '.join(word for word in sent.lower().split() if word not in stop_words)
    preprocessed_essays.append(sent.strip())
# after preprocesing
preprocessed_essays[20000]
# Apply the same cleaning pipeline to the project titles.
from tqdm import tqdm
# NOTE(review): `stopwords` is assumed to be an iterable of lowercase stop
# words defined earlier in the file — TODO confirm.
title_stop_words = set(stopwords)
preprocessed_titles = []
# tqdm is for printing the status bar
for title in tqdm(project_data['project_title'].values):
    clean_title = decontracted(title)
    # Raw text carries literal backslash escapes; replace them with spaces.
    clean_title = clean_title.replace('\\r', ' ').replace('\\"', ' ').replace('\\n', ' ')
    clean_title = re.sub('[^A-Za-z0-9]+', ' ', clean_title)
    # Fix: lowercase BEFORE filtering stop words so capitalized stop words
    # ("The", "A") are also removed; the original lowered only afterwards.
    # https://gist.github.com/sebleier/554280
    clean_title = ' '.join(word for word in clean_title.lower().split() if word not in title_stop_words)
    preprocessed_titles.append(clean_title.strip())
project_data['clean_project_title'] = preprocessed_titles
project_data.drop(['project_title'], axis=1, inplace=True)
project_data.head(2)
project_data.columns
We are going to consider the following features:
- teacher_prefix : categorical data
- project_title : text data
- project_resource_summary : text data
- quantity : numerical data
# One-hot encode clean_categories with a CountVectorizer restricted to the
# known category vocabulary (sorted_cat_dict is built earlier in the file).
from sklearn.feature_extraction.text import CountVectorizer
category_vectorizer = CountVectorizer(vocabulary=list(sorted_cat_dict.keys()), lowercase=False, binary=True)
category_vectorizer.fit(project_data['clean_categories'].values)
print(category_vectorizer.get_feature_names())
categories_one_hot = category_vectorizer.transform(project_data['clean_categories'].values)
print("Shape of matrix after one hot encodig ",categories_one_hot.shape)
# One-hot encode clean_subcategories the same way, against the sub-category vocabulary.
subcat_vectorizer = CountVectorizer(vocabulary=list(sorted_sub_cat_dict.keys()), lowercase=False, binary=True)
subcat_vectorizer.fit(project_data['clean_subcategories'].values)
print(subcat_vectorizer.get_feature_names())
sub_categories_one_hot = subcat_vectorizer.transform(project_data['clean_subcategories'].values)
print("Shape of matrix after one hot encodig ",sub_categories_one_hot.shape)
# Please do the similar feature encoding with state, teacher_prefix and project_grade_category also
# One-hot encode school_state; the vocabulary is the set of distinct state codes.
state_vectorizer = CountVectorizer(vocabulary=set(project_data['school_state'].values), lowercase=False, binary=True)
state_vectorizer.fit(project_data['school_state'].values)
print(state_vectorizer.get_feature_names())
school_state_one_hot = state_vectorizer.transform(project_data['school_state'].values)
print("Shape of matrix after one hot encodig ",school_state_one_hot.shape)
# using count vectorizer for one-hot encoding of teacher_prefix
# vectorizer = CountVectorizer(vocabulary=set(project_data['teacher_prefix'].values), lowercase=False, binary=True)
# vectorizer.fit(project_data['teacher_prefix'].values)
# print(vectorizer.get_feature_names())
# teacher_prefix_one_hot = vectorizer.transform(project_data['teacher_prefix'].values)
# print("Shape of matrix after one hot encodig ",teacher_prefix_one_hot.shape)
# Encode clean_prefix with a default CountVectorizer.
# NOTE(review): unlike the other encoders this one is neither binary nor
# vocabulary-restricted and it lowercases its input; `feature_one_hot` is not
# used again in the visible code (the binary teacher_prefix_one_hot below is) —
# confirm whether this cell is still needed.
from sklearn.feature_extraction.text import CountVectorizer
prefix_count_vectorizer = CountVectorizer()
prefix_count_vectorizer.fit(project_data['clean_prefix'].values)
feature_one_hot = prefix_count_vectorizer.transform(project_data['clean_prefix'].values)
feature_one_hot
# One-hot encode project_grade_category (stored as clean_grade_category).
grade_vectorizer = CountVectorizer(vocabulary=set(project_data['clean_grade_category'].values), lowercase=False, binary=True)
grade_vectorizer.fit(project_data['clean_grade_category'].values)
print(grade_vectorizer.get_feature_names())
project_category_one_hot = grade_vectorizer.transform(project_data['clean_grade_category'].values)
print("Shape of matrix after one hot encodig ",project_category_one_hot.shape)
# One-hot encode teacher_prefix (stored as clean_prefix).
teacher_prefix_vectorizer = CountVectorizer(vocabulary=set(project_data['clean_prefix'].values), lowercase=False, binary=True)
teacher_prefix_vectorizer.fit(project_data['clean_prefix'].values)
print(teacher_prefix_vectorizer.get_feature_names())
teacher_prefix_one_hot = teacher_prefix_vectorizer.transform(project_data['clean_prefix'].values)
# Fix: the original printed project_category_one_hot.shape here (copy-paste slip);
# report the shape of the matrix this cell actually built.
print("Shape of matrix after one hot encodig ",teacher_prefix_one_hot.shape)
# Bag-of-words on essays and titles; min_df=10 keeps only words that appear
# in at least 10 documents (rows/projects).
essay_bow_vectorizer = CountVectorizer(min_df=10)
text_bow = essay_bow_vectorizer.fit_transform(preprocessed_essays)
print("Shape of matrix after one hot encodig ",text_bow.shape)
title_bow_vectorizer = CountVectorizer(min_df=10)
title_text_bow = title_bow_vectorizer.fit_transform(preprocessed_titles)
print("Shape of matrix after one hot encodig ",title_text_bow.shape)
# TF-IDF on essays and titles with the same min_df=10 cutoff.
from sklearn.feature_extraction.text import TfidfVectorizer
essay_tfidf_vectorizer = TfidfVectorizer(min_df=10)
text_tfidf = essay_tfidf_vectorizer.fit_transform(preprocessed_essays)
print("Shape of matrix after one hot encodig ",text_tfidf.shape)
title_tfidf_vectorizer = TfidfVectorizer(min_df=10)
title_text_tfidf = title_tfidf_vectorizer.fit_transform(preprocessed_titles)
print("Shape of matrix after one hot encodig ",title_text_tfidf.shape)
# The triple-quoted string below is a deliberately disabled one-off script that
# read the raw GloVe text file (glove.42B.300d.txt), intersected its vocabulary
# with this corpus, and pickled the resulting {word: vector} dict to the file
# 'glove_vectors' that is loaded just below. It is kept only as provenance;
# re-enable it only if the pickle needs to be regenerated.
'''
# Reading glove vectors in python: https://stackoverflow.com/a/38230349/4084039
def loadGloveModel(gloveFile):
print ("Loading Glove Model")
f = open(gloveFile,'r', encoding="utf8")
model = {}
for line in tqdm(f):
splitLine = line.split()
word = splitLine[0]
embedding = np.array([float(val) for val in splitLine[1:]])
model[word] = embedding
print ("Done.",len(model)," words loaded!")
return model
model = loadGloveModel('glove.42B.300d.txt')
# ============================
Output:
Loading Glove Model
1917495it [06:32, 4879.69it/s]
Done. 1917495 words loaded!
# ============================
words = []
for i in preproced_texts:
words.extend(i.split(' '))
for i in preproced_titles:
words.extend(i.split(' '))
print("all the words in the coupus", len(words))
words = set(words)
print("the unique words in the coupus", len(words))
inter_words = set(model.keys()).intersection(words)
print("The number of words that are present in both glove vectors and our coupus", \
len(inter_words),"(",np.round(len(inter_words)/len(words)*100,3),"%)")
words_courpus = {}
words_glove = set(model.keys())
for i in words:
if i in words_glove:
words_courpus[i] = model[i]
print("word 2 vec length", len(words_courpus))
# stronging variables into pickle files python: http://www.jessicayung.com/how-to-use-pickle-to-save-and-load-variables-in-python/
import pickle
with open('glove_vectors', 'wb') as f:
pickle.dump(words_courpus, f)
'''
# Storing variables in pickle files: http://www.jessicayung.com/how-to-use-pickle-to-save-and-load-variables-in-python/
# Load the pre-built {word: 300-d vector} dict (make sure the glove_vectors
# file exists) and cache its vocabulary as a set for O(1) membership tests.
with open('glove_vectors', 'rb') as glove_file:
    model = pickle.load(glove_file)
glove_words = set(model.keys())
def _average_word2vec(sentences, embeddings, vocab, dim=300):
    """Return one average-word2vec vector per sentence.

    Fixes over the original: the essay and title loops were identical
    copy-paste and carried a misleading comment ("word vectors are of zero
    length" — they are `dim`-dimensional); both now share this helper.

    sentences  -- iterable of preprocessed, space-separated strings
    embeddings -- mapping word -> vector (the GloVe dict)
    vocab      -- set of words that have a vector (O(1) membership)
    dim        -- vector dimensionality (GloVe 300-d here)

    A sentence with no known word maps to the zero vector.
    """
    vectors = []
    for sentence in tqdm(sentences):
        vector = np.zeros(dim)  # accumulator; word vectors are dim-dimensional
        known_words = 0         # words in the sentence that had a vector
        for word in sentence.split():
            if word in vocab:
                vector += embeddings[word]
                known_words += 1
        if known_words != 0:
            vector /= known_words
        vectors.append(vector)
    return vectors

# average Word2Vec for every essay
avg_w2v_vectors = _average_word2vec(preprocessed_essays, model, glove_words)
print(len(avg_w2v_vectors))
print(len(avg_w2v_vectors[0]))
# average Word2Vec for every project title
titles_avg_w2v_vectors = _average_word2vec(preprocessed_titles, model, glove_words)
print(len(titles_avg_w2v_vectors))
print(len(titles_avg_w2v_vectors[0]))
def _tfidf_weighted_word2vec(sentences, embeddings, vocab, idf, idf_vocab, dim=300):
    """Return one TF-IDF-weighted average word2vec vector per sentence.

    Fix over the original: term frequency is computed from word-level counts
    (Counter over sentence.split()) instead of `sentence.count(word)`, which
    counted SUBSTRING matches (e.g. 'art' inside 'start') and inflated tf.
    The word count and sentence length are also computed once per sentence
    instead of once per word.

    sentences  -- iterable of preprocessed, space-separated strings
    embeddings -- mapping word -> vector (the GloVe dict)
    vocab      -- set of words that have a vector
    idf        -- mapping word -> idf value (from a fitted TfidfVectorizer)
    idf_vocab  -- set of words present in `idf`
    """
    vectors = []
    for sentence in tqdm(sentences):
        words = sentence.split()
        word_counts = Counter(words)  # word-level term frequencies
        n_words = len(words)
        vector = np.zeros(dim)
        weight_sum = 0
        for word in words:
            if (word in vocab) and (word in idf_vocab):
                tf = word_counts[word] / n_words
                tf_idf = idf[word] * tf
                vector += embeddings[word] * tf_idf  # tfidf-weighted w2v
                weight_sum += tf_idf
        if weight_sum != 0:
            vector /= weight_sum
        vectors.append(vector)
    return vectors

# Fit TF-IDF on the essays to obtain per-word idf values.
tfidf_model = TfidfVectorizer()
tfidf_model.fit(preprocessed_essays)
# dictionary maps each word to its idf value
dictionary = dict(zip(tfidf_model.get_feature_names(), list(tfidf_model.idf_)))
tfidf_words = set(tfidf_model.get_feature_names())
tfidf_w2v_vectors = _tfidf_weighted_word2vec(preprocessed_essays, model, glove_words, dictionary, tfidf_words)
print(len(tfidf_w2v_vectors))
print(len(tfidf_w2v_vectors[0]))

# Same procedure for the project titles, with idf fitted on the titles.
tfidf_model = TfidfVectorizer()
tfidf_model.fit(preprocessed_titles)
titles_dictionary = dict(zip(tfidf_model.get_feature_names(), list(tfidf_model.idf_)))
titles_tfidf_words = set(tfidf_model.get_feature_names())
titles_tfidf_w2v_vectors = _tfidf_weighted_word2vec(preprocessed_titles, model, glove_words, titles_dictionary, titles_tfidf_words)
print(len(titles_tfidf_w2v_vectors))
print(len(titles_tfidf_w2v_vectors[0]))
# check this one: https://www.youtube.com/watch?v=0HOqOcln3Z4&t=530s
# standardization sklearn: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
from sklearn.preprocessing import StandardScaler
# StandardScaler requires a 2-D array, hence reshape(-1, 1); fitting the raw
# 1-D price column would raise "ValueError: Expected 2D array, got 1D array".
price_scalar = StandardScaler()
price_column = project_data['price'].values.reshape(-1, 1)
price_scalar.fit(price_column)  # learns the mean and standard deviation
print(f"Mean : {price_scalar.mean_[0]}, Standard deviation : {np.sqrt(price_scalar.var_[0])}")
# Standardize the prices using the learned mean and variance.
price_standardized = price_scalar.transform(price_column)
price_standardized
# Standardize teacher_number_of_previously_posted_projects the same way.
teacher_number_scalar = StandardScaler()
prev_projects_column = project_data['teacher_number_of_previously_posted_projects'].values.reshape(-1, 1)
teacher_number_scalar.fit(prev_projects_column)  # learns mean and standard deviation
print(f"Mean : {teacher_number_scalar.mean_[0]}, Standard deviation : {np.sqrt(teacher_number_scalar.var_[0])}")
# Standardize using the learned mean and variance.
teacher_number_standardized = teacher_number_scalar.transform(prev_projects_column)
teacher_number_standardized
We need to merge all the feature vectors — categorical, text, and numerical — into one matrix.
Let's first describe the number of features we have.
# Print the shape of every feature matrix, then stack them all horizontally.
for sparse_feature in (categories_one_hot, sub_categories_one_hot, school_state_one_hot,
                       teacher_prefix_one_hot, project_category_one_hot,
                       price_standardized, teacher_number_standardized,
                       text_bow, title_text_bow, text_tfidf, title_text_tfidf):
    print(sparse_feature.shape)
for dense_feature in (avg_w2v_vectors, titles_avg_w2v_vectors,
                      tfidf_w2v_vectors, titles_tfidf_w2v_vectors):
    print(np.asarray(dense_feature).shape)
from scipy.sparse import hstack
# Concatenate every encoding (order matters only for column layout).
X_whole = hstack((categories_one_hot, sub_categories_one_hot, school_state_one_hot,
                  teacher_prefix_one_hot, project_category_one_hot,
                  price_standardized, teacher_number_standardized,
                  text_bow, title_text_bow, text_tfidf, title_text_tfidf,
                  avg_w2v_vectors, titles_avg_w2v_vectors,
                  tfidf_w2v_vectors, titles_tfidf_w2v_vectors))
X_whole.shape
# Take the first 5000 rows of every feature matrix for the t-SNE plots.
SAMPLE_SIZE = 5000
sample_categories = categories_one_hot[:SAMPLE_SIZE]
sample_subcategories = sub_categories_one_hot[:SAMPLE_SIZE]
sample_school_state = school_state_one_hot[:SAMPLE_SIZE]
sample_teacher_prefix = teacher_prefix_one_hot[:SAMPLE_SIZE]
sample_project_cat = project_category_one_hot[:SAMPLE_SIZE]
sample_price = price_standardized[:SAMPLE_SIZE]
sample_teacher_number = teacher_number_standardized[:SAMPLE_SIZE]
sample_text_bow = text_bow[:SAMPLE_SIZE]
sample_title_text_bow = title_text_bow[:SAMPLE_SIZE]
sample_text_tfidf = text_tfidf[:SAMPLE_SIZE]
sample_title_text_tfidf = title_text_tfidf[:SAMPLE_SIZE]
sample_avg_w2v_vectors = np.asarray(avg_w2v_vectors)[:SAMPLE_SIZE]
sample_titles_avg_w2v_vectors = np.asarray(titles_avg_w2v_vectors)[:SAMPLE_SIZE]
sample_tfidf_w2v_vectors = np.asarray(tfidf_w2v_vectors)[:SAMPLE_SIZE]
sample_titles_tfidf_w2v_vectors = np.asarray(titles_tfidf_w2v_vectors)[:SAMPLE_SIZE]
# Data matrix: categorical + numerical + project_title BOW features.
X_Bow = hstack((sample_categories, sample_subcategories, sample_school_state,
                sample_teacher_prefix, sample_project_cat, sample_price,
                sample_teacher_number, sample_title_text_bow))
Y = project_data['project_is_approved'][:SAMPLE_SIZE]
np.asarray(Y).reshape(-1,1)
# t-SNE on the BOW feature matrix; sparse input must be densified with .toarray().
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, perplexity=50, random_state=0, n_iter=5000)
X_embedding = tsne.fit_transform(X_Bow.toarray())
Y = project_data['project_is_approved'][:5000]
for_tsne_df = pd.DataFrame(
    data=np.hstack((X_embedding, np.asarray(Y).reshape(-1, 1))),
    columns=['Dim_x', 'Dim_y', 'label'])
sns.FacetGrid(for_tsne_df, hue='label', size=6).map(plt.scatter, 'Dim_x', 'Dim_y').add_legend()
plt.title("T-SNE plot for Bow encoding of project_title_feature")
plt.show()
# Data matrix: categorical + numerical + project_title TFIDF features.
X_TFIDF = hstack((sample_categories, sample_subcategories, sample_school_state,
                  sample_teacher_prefix, sample_project_cat, sample_price,
                  sample_teacher_number, sample_title_text_tfidf))
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, perplexity=50, random_state = 0, n_iter = 5000)
# sparse matrices must be densified with .toarray() before t-SNE
X_embedding = tsne.fit_transform(X_TFIDF.toarray())
Y = project_data['project_is_approved'][:5000]
for_tsne = np.hstack((X_embedding, np.asarray(Y).reshape(-1,1)))
for_tsne_df = pd.DataFrame(data=for_tsne, columns=['Dim_x','Dim_y','label'])
sns.FacetGrid(for_tsne_df,hue='label',size=6).map(plt.scatter,'Dim_x','Dim_y').add_legend()
# Fix: the original title said "Bow" (copy-paste) — this plot uses TFIDF.
plt.title("T-SNE plot for TFIDF encoding of project_title_feature")
plt.show()
# Data matrix: categorical + numerical + project_title AVG-W2V features.
X_AVG_W2V = hstack((sample_categories, sample_subcategories, sample_school_state,
                    sample_teacher_prefix, sample_project_cat, sample_price,
                    sample_teacher_number, sample_titles_avg_w2v_vectors))
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, perplexity=50, random_state = 0, n_iter = 5000)
# sparse matrices must be densified with .toarray() before t-SNE
X_embedding = tsne.fit_transform(X_AVG_W2V.toarray())
Y = project_data['project_is_approved'][:5000]
for_tsne = np.hstack((X_embedding, np.asarray(Y).reshape(-1,1)))
for_tsne_df = pd.DataFrame(data=for_tsne, columns=['Dim_x','Dim_y','label'])
sns.FacetGrid(for_tsne_df,hue='label',size=6).map(plt.scatter,'Dim_x','Dim_y').add_legend()
# Fix: the original title said "Bow" (copy-paste) — this plot uses AVG-W2V.
plt.title("T-SNE plot for AVG-W2V encoding of project_title_feature")
plt.show()
# Data matrix: categorical + numerical + project_title TFIDF-W2V features.
X_TFIDF_W2V = hstack((sample_categories, sample_subcategories, sample_school_state,
                      sample_teacher_prefix, sample_project_cat, sample_price,
                      sample_teacher_number, sample_titles_tfidf_w2v_vectors))
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, perplexity=50, random_state = 0, n_iter = 5000)
# sparse matrices must be densified with .toarray() before t-SNE
X_embedding = tsne.fit_transform(X_TFIDF_W2V.toarray())
Y = project_data['project_is_approved'][:5000]
for_tsne = np.hstack((X_embedding, np.asarray(Y).reshape(-1,1)))
for_tsne_df = pd.DataFrame(data=for_tsne, columns=['Dim_x','Dim_y','label'])
sns.FacetGrid(for_tsne_df,hue='label',size=6).map(plt.scatter,'Dim_x','Dim_y').add_legend()
# Fix: the original title said "Bow" (copy-paste) — this plot uses TFIDF-W2V.
plt.title("T-SNE plot for TFIDF-W2V encoding of project_title_feature")
plt.show()